import re
import time
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup


# Browser User-Agent shared by every header preset below (trailing space kept
# exactly as captured from the original browser session).
_UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
       "Chrome/97.0.4692.71 Safari/537.36 ")

# Header preset with the full cookie jar, including the Baidu referral cookie.
# NOTE(review): not referenced by the script body below — kept for manual use.
headers = {
    "User-Agent": _UA,
    "Cookie": "BAIDU_SSP_lcr=https://www.baidu.com/link?url=HRrLFmvpqSrlGZzg1JsRFocKbpponOFurkeWl7Dm6d6eiBYGM7l"
              "-rtWBo5Bc3QVb&wd=&eqid=bb8a063a00274e770000000262373d46; "
              "UM_distinctid=17fa7c77a217a1-0f84f77f9c1a8b-f791b31-1fa400-17fa7c77a22c7f; "
              "CNZZDATA1275796416=646152846-1647783060-https%253A%252F%252Fwww.baidu.com%252F%7C1647783060; "
              "Hm_lvt_ab6a683aa97a52202eab5b3a9042a8d2=1647787342; Hm_lpvt_ab6a683aa97a52202eab5b3a9042a8d2=1647787427 "
}

# Header preset with analytics/session cookies only (no Baidu referral cookie).
# NOTE(review): also unused by the script body below.
headers1 = {
    "User-Agent": _UA,
    "Cookie": "UM_distinctid=17fa7c77a217a1-0f84f77f9c1a8b-f791b31-1fa400-17fa7c77a22c7f; "
              "CNZZDATA1275796416=646152846-1647783060-https%253A%252F%252Fwww.baidu.com%252F%7C1647783060; "
              "Hm_lvt_ab6a683aa97a52202eab5b3a9042a8d2=1647787342; Hm_lpvt_ab6a683aa97a52202eab5b3a9042a8d2=1647787427 "
}

# Minimal, cookieless headers — this is the preset the scraper actually sends.
headers2 = {
    "User-Agent": _UA
}


if __name__ == "__main__":

    dates = ['201801', '201802', '201803', '201804', '201805', '201806', '201807', '201808', '201809', '201810', '201811'
        , '201812', '201901', '201902', '201903', '201904']

    data = []

    for date in dates:

        print("fetch weather date: " + date)

        url = "http://lishi.tianqi.com/hefei/" + date + ".html"
        s = requests.session()
        response = s.get(url, headers=headers2, verify=False)

        soup = BeautifulSoup(response.text, 'html.parser', from_encoding='utf8')
        nodes = soup.findAll('ul', class_="thrui")[0]

        for node in nodes:
            if str(type(node)) == "<class 'bs4.element.Tag'>" and (
                    node.attrs == {} or node.attrs['class'] != ['lishidesc2']):
                row = []
                for child in node.children:
                    if child.get_text() != '\n' and child.get_text() != '':
                        row.append(child.get_text())
                data.append(row)

        time.sleep(2)

    # s = requests.session()
    # response = s.get("http://lishi.tianqi.com/beijing/202202.html", headers=headers2, verify=False)
    # soup = BeautifulSoup(response.text, 'html.parser', from_encoding='utf8')
    #
    # nodes = soup.findAll('ul',class_="thrui")[0]
    # data = []
    # for node in nodes:
    #
    #     if str(type(node)) == "<class 'bs4.element.Tag'>" and (node.attrs == {} or node.attrs['class'] != ['lishidesc2']):
    #         row = []
    #         for child in node.children:
    #             if child.get_text() != '\n' and child.get_text() != '':
    #                 row.append(child.get_text())
    #         data.append(row)

    df = pd.DataFrame(data,columns=['date', 'max temperature', 'min temperature', 'weather', 'wind'])
    df.to_csv('./data_weather/hefei_weather.csv')
    print("hefei weather complete!")